U.S. Pollution Data¶
Pollution in the U.S. since 2000¶
External Kaggle Notebooks¶
US Pollution Project
## Import Librariesimport pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
poll = pd.read_csv("../input/uspollution/pollution_us_2000_2016.csv")
poll.info
poll.head
poll.dtypes
poll.isna().sum()
poll.count()
poll = poll.drop(['Unnamed: 0','State Code','County Code','Site Num','Address','NO2 Units','O3 Units','SO2 Units','CO Units'],axis=1)
poll.head()
| State | County | City | Date Local | NO2 Mean | NO2 1st Max Value | NO2 1st Max Hour | NO2 AQI | O3 Mean | O3 1st Max Value | O3 1st Max Hour | O3 AQI | SO2 Mean | SO2 1st Max Value | SO2 1st Max Hour | SO2 AQI | CO Mean | CO 1st Max Value | CO 1st Max Hour | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arizona | Maricopa | Phoenix | 2000-01-01 | 19.041667 | 49.0 | 19 | 46 | 0.022500 | 0.040 | 10 | 34 | 3.000000 | 9.0 | 21 | 13.0 | 1.145833 | 4.2 | 21 | NaN |
| 1 | Arizona | Maricopa | Phoenix | 2000-01-01 | 19.041667 | 49.0 | 19 | 46 | 0.022500 | 0.040 | 10 | 34 | 3.000000 | 9.0 | 21 | 13.0 | 0.878947 | 2.2 | 23 | 25.0 |
| 2 | Arizona | Maricopa | Phoenix | 2000-01-01 | 19.041667 | 49.0 | 19 | 46 | 0.022500 | 0.040 | 10 | 34 | 2.975000 | 6.6 | 23 | NaN | 1.145833 | 4.2 | 21 | NaN |
| 3 | Arizona | Maricopa | Phoenix | 2000-01-01 | 19.041667 | 49.0 | 19 | 46 | 0.022500 | 0.040 | 10 | 34 | 2.975000 | 6.6 | 23 | NaN | 0.878947 | 2.2 | 23 | 25.0 |
| 4 | Arizona | Maricopa | Phoenix | 2000-01-02 | 22.958333 | 36.0 | 19 | 34 | 0.013375 | 0.032 | 10 | 27 | 1.958333 | 3.0 | 22 | 4.0 | 0.850000 | 1.6 | 23 | NaN |
## Prepare all 4 AQIs against state and date
pollSt = poll[['State','Date Local','NO2 AQI','O3 AQI','SO2 AQI','CO AQI']]
pollSt = pollSt.dropna(axis='rows') # Delete rows with NAs
pollSt = pollSt[pollSt.State!='Country Of Mexico'] # Delete Mexico
pollSt['Date Local'] = pd.to_datetime(pollSt['Date Local'],format='%Y-%m-%d') # Change date from string to date value
pollSt = pollSt.groupby(['State','Date Local']).mean() # Take mean values if there are depulicated entries
pollStGrouped = pollSt.groupby(level=0)
pollSt.info
pollSt.tail(5)
| NO2 AQI | O3 AQI | SO2 AQI | CO AQI | ||
|---|---|---|---|---|---|
| State | Date Local | ||||
| Wyoming | 2016-03-27 | 22.0 | 46.0 | 0.0 | 1.0 |
| 2016-03-28 | 21.0 | 48.0 | 0.0 | 1.0 | |
| 2016-03-29 | 3.0 | 37.0 | 0.0 | 1.0 | |
| 2016-03-30 | 1.0 | 44.0 | 0.0 | 1.0 | |
| 2016-03-31 | 1.0 | 44.0 | 0.0 | 1.0 |
pollSt.head(15)
| NO2 AQI | O3 AQI | SO2 AQI | CO AQI | ||
|---|---|---|---|---|---|
| State | Date Local | ||||
| Alabama | 2013-12-01 | 37.0 | 24.0 | 1.0 | 6.0 |
| 2013-12-02 | 30.0 | 12.0 | 3.0 | 6.0 | |
| 2013-12-03 | 21.0 | 11.0 | 3.0 | 3.0 | |
| 2013-12-04 | 18.0 | 13.0 | 1.0 | 2.0 | |
| 2013-12-05 | 15.0 | 13.0 | 1.0 | 2.0 | |
| 2013-12-06 | 9.0 | 6.0 | 0.0 | 2.0 | |
| 2013-12-07 | 8.0 | 9.0 | 0.0 | 2.0 | |
| 2013-12-08 | 9.0 | 0.0 | 4.0 | 3.0 | |
| 2013-12-09 | 8.0 | 7.0 | 0.0 | 3.0 | |
| 2013-12-10 | 26.0 | 19.0 | 1.0 | 3.0 | |
| 2013-12-11 | 26.0 | 17.0 | 17.0 | 5.0 | |
| 2013-12-12 | 26.0 | 25.0 | 11.0 | 3.0 | |
| 2013-12-13 | 33.0 | 18.0 | 17.0 | 6.0 | |
| 2013-12-14 | 20.0 | 20.0 | 7.0 | 2.0 | |
| 2013-12-15 | 8.0 | 20.0 | 0.0 | 1.0 |
pollSt = pollSt.rename(columns={'NO2 AQI': 'NO2_AQI', 'O3 AQI': 'O3_AQI', 'SO2 AQI': 'SO2_AQI', 'CO AQI': 'CO_AQI' })
pollSt.head(5)
| NO2_AQI | O3_AQI | SO2_AQI | CO_AQI | ||
|---|---|---|---|---|---|
| State | Date Local | ||||
| Alabama | 2013-12-01 | 37.0 | 24.0 | 1.0 | 6.0 |
| 2013-12-02 | 30.0 | 12.0 | 3.0 | 6.0 | |
| 2013-12-03 | 21.0 | 11.0 | 3.0 | 3.0 | |
| 2013-12-04 | 18.0 | 13.0 | 1.0 | 2.0 | |
| 2013-12-05 | 15.0 | 13.0 | 1.0 | 2.0 |
pollSt.corr()
| NO2_AQI | O3_AQI | SO2_AQI | CO_AQI | |
|---|---|---|---|---|
| NO2_AQI | 1.000000 | 0.025708 | 0.397214 | 0.598608 |
| O3_AQI | 0.025708 | 1.000000 | 0.024482 | -0.126909 |
| SO2_AQI | 0.397214 | 0.024482 | 1.000000 | 0.335044 |
| CO_AQI | 0.598608 | -0.126909 | 0.335044 | 1.000000 |
# Finding the relations between the variables.
plt.figure(figsize=(20,10))
c= pollSt.corr()
sns.heatmap(c,cmap='BrBG',annot=True)
c
| NO2_AQI | O3_AQI | SO2_AQI | CO_AQI | |
|---|---|---|---|---|
| NO2_AQI | 1.000000 | 0.025708 | 0.397214 | 0.598608 |
| O3_AQI | 0.025708 | 1.000000 | 0.024482 | -0.126909 |
| SO2_AQI | 0.397214 | 0.024482 | 1.000000 | 0.335044 |
| CO_AQI | 0.598608 | -0.126909 | 0.335044 | 1.000000 |
# Plotting a scatter plot
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(pollSt['CO_AQI'], pollSt['NO2_AQI'])
ax.set_xlabel('CO_AQI')
ax.set_ylabel('NO2_AQI')
plt.show()
USA Pollution - analysis of AQI index
USA Pollution - analysis of AQI index
# 0. Import packagesimport numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
df = pd.read_csv("../input/uspollution/pollution_us_2000_2016.csv")
df.head()
| Unnamed: 0 | State Code | County Code | Site Num | Address | State | County | City | Date Local | NO2 Units | ... | SO2 Units | SO2 Mean | SO2 1st Max Value | SO2 1st Max Hour | SO2 AQI | CO Units | CO Mean | CO 1st Max Value | CO 1st Max Hour | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 3.000000 | 9.0 | 21 | 13.0 | Parts per million | 1.145833 | 4.2 | 21 | NaN |
| 1 | 1 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 3.000000 | 9.0 | 21 | 13.0 | Parts per million | 0.878947 | 2.2 | 23 | 25.0 |
| 2 | 2 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 2.975000 | 6.6 | 23 | NaN | Parts per million | 1.145833 | 4.2 | 21 | NaN |
| 3 | 3 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 2.975000 | 6.6 | 23 | NaN | Parts per million | 0.878947 | 2.2 | 23 | 25.0 |
| 4 | 4 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-02 | Parts per billion | ... | Parts per billion | 1.958333 | 3.0 | 22 | 4.0 | Parts per million | 0.850000 | 1.6 | 23 | NaN |
5 rows × 29 columns
# Deleting first column
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.shape
df.isna().sum()
df.describe(include = ['object'])
| Address | State | County | City | Date Local | NO2 Units | O3 Units | SO2 Units | CO Units | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1746661 | 1746661 | 1746661 | 1746661 | 1746661 | 1746661 | 1746661 | 1746661 | 1746661 |
| unique | 204 | 47 | 133 | 144 | 5996 | 1 | 1 | 1 | 1 |
| top | PIKE AVE AT RIVER ROAD | California | Los Angeles | Not in a city | 2002-06-10 | Parts per billion | Parts per million | Parts per billion | Parts per million |
| freq | 35332 | 576142 | 93381 | 138411 | 640 | 1746661 | 1746661 | 1746661 | 1746661 |
df.dtypes
df['Date Local'] = pd.to_datetime(df['Date Local'])
# TOP 10 biggest polluters by mean value of AQI for Nitrogen Dioxide (NO2), Ozone (O3), Sulfur Dioxide (SO2) and Carbon Monoxide (CO) for 2000-2016
# Calculating mean value of pollutants for every State
df_AQI = df[['State', 'Date Local', 'NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']]
df_AQI_State = df_AQI.groupby('State').mean()
df_AQI_State.reset_index(inplace=True)
# Charts size
plt.rcParams["figure.figsize"] = (20, 15)
# Creating x variable for 10 biggest polluters
x = np.arange(10)
# Adding four charts
fig, axs = plt.subplots(4, 1)
# Plot for NO2
df_AQI_State.sort_values(by = 'NO2 AQI', ascending = False, inplace = True)
barplot1 = axs[0].bar(x, 'NO2 AQI', data=df_AQI_State[:10], label = 'NO2')
xlabels = df_AQI_State.loc[:10, 'State']
plt.sca(axs[0])
plt.xticks(x, xlabels) # x axis marks
# Plot for O3
df_AQI_State.sort_values(by = 'O3 AQI', ascending = False, inplace = True)
barplot2 = axs[1].bar(x, 'O3 AQI', data=df_AQI_State[:10], label = 'O3', color = 'red')
xlabels = df_AQI_State.loc[:10, 'State']
plt.sca(axs[1])
plt.xticks(x, xlabels) # x axis marks
# Plot for SO2
df_AQI_State.sort_values(by = 'SO2 AQI', ascending = False, inplace = True)
barplot3 = axs[2].bar(x, 'SO2 AQI', data=df_AQI_State[:10], label = 'SO2', color = 'green')
xlabels = df_AQI_State.loc[:10, 'State']
plt.sca(axs[2])
plt.xticks(x, xlabels) # x axis marks
# Plot for CO
df_AQI_State.sort_values(by = 'CO AQI', ascending = False, inplace = True)
barplot4 = axs[3].bar(x, 'CO AQI', data=df_AQI_State[:10], label = 'CO', color = 'purple')
xlabels = df_AQI_State.loc[:10, 'State']
plt.sca(axs[3])
plt.xticks(x, xlabels) # x axis marks
# change of font size on both axes
axs[0].tick_params(axis='both', which='major', labelsize=12)
axs[1].tick_params(axis='both', which='major', labelsize=12)
axs[2].tick_params(axis='both', which='major', labelsize=12)
axs[3].tick_params(axis='both', which='major', labelsize=12)
# Adding value for each bar
i = 0
def autolabel(rects):
"""Attach a text label above each bar in *rects*, displaying its height."""
global i
for rect in rects:
height = rect.get_height()
axs[i].annotate('{:.2f}'.format(height),
xy=(rect.get_x() + rect.get_width() / 2, height),
xytext=(0, 3), # 3 points vertical offset
textcoords="offset points",
ha='center', va='bottom', size = 12)
i = i + 1
autolabel(barplot1)
autolabel(barplot2)
autolabel(barplot3)
autolabel(barplot4)
# Space between charts
plt.subplots_adjust(hspace=0.3)
# Adding plot titles and axis titles
axs[0].set_title('TOP 10 biggest polluters by mean value of AQI for Nitrogen Dioxide (NO2) for 2000-2016', fontsize = 14)
axs[1].set_title('TOP 10 biggest polluters by mean value of AQI for Ozone (O3) for 2000-2016', fontsize = 14)
axs[2].set_title('TOP 10 biggest polluters by mean value of AQI for Sulfur Dioxide (SO2) for 2000-2016', fontsize = 14)
axs[3].set_title('TOP 10 biggest polluters by mean value of AQI for Carbon Monoxide (CO) for 2000-2016', fontsize = 14)
fig.text(0.1, 0.5, 'Mean value of AQI', ha='center', va='center', rotation='vertical', fontsize = 14)
# Mean value of pollutants for USA in 2000-2016
# Data
df_AQI_Year = df_AQI[['Date Local', 'NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']].resample('Y', on = 'Date Local').mean()
df_AQI_Year.reset_index(inplace=True)
# Plot size
plt.rcParams["figure.figsize"] = (12, 8)
# Adding plots
fig, ax = plt.subplots()
ax.plot('Date Local', 'NO2 AQI', data=df_AQI_Year, color = "red", label = "NO2 AQI")
ax.plot('Date Local', 'O3 AQI', data=df_AQI_Year, color = "blue", label = "O3 AQI")
ax.plot('Date Local', 'SO2 AQI', data=df_AQI_Year, color = "green", label = "SO2 AQI")
ax.plot('Date Local', 'CO AQI', data=df_AQI_Year, color = "purple", label = "CO AQI")
# Adding plot title and axis titles
plt.title('Mean value of pollutants for USA in 2000-2016', fontsize = 16)
plt.xlabel("Year", fontsize = 13)
plt.ylabel("Mean value of AQI", fontsize = 13)
# change of font size on both axes
ax.tick_params(axis='both', which='major', labelsize=11)
# Adding legend
ax.legend()
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -.15), ncol = 6, shadow = True, fontsize = 13)
## To use plotly choropleth maps, states names must be encoded.
us_state_abbrev = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY',
}
df = df[df['State']!='Country Of Mexico'] # deleting Mexico
df = df[df['State']!='District Of Columbia'] # deleting Columbia
df['State_abbrev'] = df.State.apply(lambda x: us_state_abbrev[x])
# Calculating annual mean value of pollutants for every State
df_AQI = df[['State_abbrev', 'Date Local', 'NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']]
df_AQI_State_Year = df_AQI.groupby('State_abbrev').resample('Y', on = 'Date Local').mean()
df_AQI_State_Year.reset_index(inplace = True)
df_AQI_State_Year
| State_abbrev | Date Local | NO2 AQI | O3 AQI | SO2 AQI | CO AQI | |
|---|---|---|---|---|---|---|
| 0 | AK | 2014-12-31 | 21.167598 | 15.206704 | 14.000000 | 6.983240 |
| 1 | AK | 2015-12-31 | 18.634340 | 19.158983 | 14.764706 | 6.269841 |
| 2 | AL | 2013-12-31 | 21.387097 | 18.903226 | 6.580645 | 4.129032 |
| 3 | AL | 2014-12-31 | 21.495854 | 36.983416 | 7.956954 | 3.711443 |
| 4 | AL | 2015-12-31 | 20.026667 | 37.343333 | 6.950000 | 3.933333 |
| ... | ... | ... | ... | ... | ... | ... |
| 515 | WY | 2012-12-31 | 11.709722 | 40.455556 | 3.986111 | 2.636111 |
| 516 | WY | 2013-12-31 | 11.206416 | 41.295676 | 0.687587 | 0.824022 |
| 517 | WY | 2014-12-31 | 8.973533 | 44.806674 | 0.547756 | 0.442396 |
| 518 | WY | 2015-12-31 | 9.368617 | 43.015785 | 1.894150 | 0.713755 |
| 519 | WY | 2016-12-31 | 10.288889 | 41.077778 | 0.200000 | 1.155556 |
520 rows × 6 columns
# Adding column coresponding to the year of Date Local column
df_AQI_State_Year['Year'] = df_AQI_State_Year['Date Local'].dt.year
# Sorting values by Date Local (for animated choropleth presented below)
df_AQI_State_Year.sort_values(by = 'Date Local', inplace = True)
# Plotly choropleth showing AQI for Nitrogen Dioxide changes from 2000 to 2016
fig_NO2 = px.choropleth(df_AQI_State_Year,
locations = 'State_abbrev',
animation_frame="Year", # showing changes through the years
color='NO2 AQI',
# Creating fixed scale (the same as defined by EPA)
color_continuous_scale = [(0.00, "green"), (0.1, "green"),
(0.1, "yellow"), (0.2, "yellow"),
(0.2, "orange"), (0.3, "orange"),
(0.3, "red"), (0.4, "red"),
(0.4, "purple"), (0.6, "purple"),
(0.6, "maroon"), (1.00, "maroon"),
],
range_color = (0, 500),
locationmode='USA-states',
scope="usa",
title='Mean values of Air Quality Index (AQI) per year for Nitrogen Dioxide (NO2)',
height=600,
)
# Modifying legend
fig_NO2.update_layout(coloraxis_colorbar=dict(
title="Air Quality Index (AQI)",
ticks="outside",
dtick=50
))
# Plotly choropleth showing AQI for Ozone changes from 2000 to 2016
fig_O3 = px.choropleth(df_AQI_State_Year,
locations = 'State_abbrev',
animation_frame="Year", # showing changes through the years
color='O3 AQI',
# Creating fixed scale (the same as defined by EPA)
color_continuous_scale = [(0.00, "green"), (0.1, "green"),
(0.1, "yellow"), (0.2, "yellow"),
(0.2, "orange"), (0.3, "orange"),
(0.3, "red"), (0.4, "red"),
(0.4, "purple"), (0.6, "purple"),
(0.6, "maroon"), (1.00, "maroon"),
],
range_color = (0, 500),
locationmode='USA-states',
scope="usa",
title='Mean values of Air Quality Index (AQI) per year for Ozone (O3)',
height=600,
)
# Modifying legend
fig_O3.update_layout(coloraxis_colorbar=dict(
title="Air Quality Index (AQI)",
ticks="outside",
dtick=50
))
# Plotly choropleth showing AQI for Sulfur Dioxide changes from 2000 to 2016
fig_SO2 = px.choropleth(df_AQI_State_Year,
locations = 'State_abbrev',
animation_frame="Year", # showing changes through the years
color='SO2 AQI',
# Creating fixed scale (the same as defined by EPA)
color_continuous_scale = [(0.00, "green"), (0.1, "green"),
(0.1, "yellow"), (0.2, "yellow"),
(0.2, "orange"), (0.3, "orange"),
(0.3, "red"), (0.4, "red"),
(0.4, "purple"), (0.6, "purple"),
(0.6, "maroon"), (1.00, "maroon"),
],
range_color = (0, 500),
locationmode='USA-states',
scope="usa",
title='Mean values of Air Quality Index (AQI) per year for Sulfur Dioxide (SO2)',
height=600,
)
# Modifying legend
fig_SO2.update_layout(coloraxis_colorbar=dict(
title="Air Quality Index (AQI)",
ticks="outside",
dtick=50
))
# Plotly choropleth showing AQI for Carbon Monoxide changes from 2000 to 2016
fig_CO = px.choropleth(df_AQI_State_Year,
locations = 'State_abbrev',
animation_frame="Year", # showing changes through the years
color='CO AQI',
# Creating fixed scale (the same as defined by EPA)
color_continuous_scale = [(0.00, "green"), (0.1, "green"),
(0.1, "yellow"), (0.2, "yellow"),
(0.2, "orange"), (0.3, "orange"),
(0.3, "red"), (0.4, "red"),
(0.4, "purple"), (0.6, "purple"),
(0.6, "maroon"), (1.00, "maroon"),
],
range_color = (0, 500),
locationmode='USA-states',
scope="usa",
title='Mean values of Air Quality Index (AQI) per year for Carbon Monoxide (CO)',
height=600,
)
# Modifying legend
fig_CO.update_layout(coloraxis_colorbar=dict(
title="Air Quality Index (AQI)",
ticks="outside",
dtick=50
))
# Calculating daily mean value of pollutants for every State
df_AQI_State_Month = df_AQI.groupby('State_abbrev').resample('D', on = 'Date Local').mean()
df_AQI_State_Month.reset_index(inplace = True)
# Adding columns coresponding to the day, month and year of Date Local column
df_AQI_State_Month['Day'] = df_AQI_State_Month['Date Local'].dt.day
df_AQI_State_Month['Month'] = df_AQI_State_Month['Date Local'].dt.month
df_AQI_State_Month['Year'] = df_AQI_State_Month['Date Local'].dt.year
# Data for July in 2005
df_AQI_State_2005_July = df_AQI_State_Month[(df_AQI_State_Month['Year'] == 2005) & (df_AQI_State_Month['Month'] == 7)]
# Sorting values by Date Local (for animated choropleth presented below)
df_AQI_State_2005_July.sort_values(by = 'Date Local', inplace = True)
# Plotly choropleth showing AQI for Ozone changes for July 2005
fig_O3_v2 = px.choropleth(df_AQI_State_2005_July,
locations = 'State_abbrev',
animation_frame="Day", # showing changes through the days
color='O3 AQI',
# Creating fixed scale (the same as defined by EPA)
color_continuous_scale = [(0.00, "green"), (0.1, "green"),
(0.1, "yellow"), (0.2, "yellow"),
(0.2, "orange"), (0.3, "orange"),
(0.3, "red"), (0.4, "red"),
(0.4, "purple"), (0.6, "purple"),
(0.6, "maroon"), (1.00, "maroon"),
],
range_color = (0, 500),
locationmode='USA-states',
scope="usa",
title='Mean values of Air Quality Index (AQI) per day for Ozone (O3) for July 2005',
height=600,
)
# Modifying legend
fig_O3_v2.update_layout(coloraxis_colorbar=dict(
title="Air Quality Index (AQI)",
ticks="outside",
dtick=50
))
Data analysis & interactive visualizations (plotly
Data analysis & interactive visualizations (plotly
### Background This dataset deals with pollution in the U.S. It contains daily data for the four major pollutants NO2, O3, SO2 and CO during 2000 and 2010. The data set comes with 28 variables (among which each of the four pollutations represents five columns) and more than one million observations. The source of this data set is https://www.kaggle.com/sogun3/uspollution. The original data was scraped from the database of U.S. EPA : https://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/download_files.html This notebook mainly deals with pollution in the state California, since it by far has the most data points. The focus of this work is on data cleaning and visualization. Remarks and suggestions for improvement are very welcome. Please upvote if you like my work!import pandas as pd
import matplotlib.pyplot as pp
import numpy as np
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
raw_data = pd.read_csv('../input/uspollution/pollution_us_2000_2016.csv')
raw_data.head(5)
| Unnamed: 0 | State Code | County Code | Site Num | Address | State | County | City | Date Local | NO2 Units | ... | SO2 Units | SO2 Mean | SO2 1st Max Value | SO2 1st Max Hour | SO2 AQI | CO Units | CO Mean | CO 1st Max Value | CO 1st Max Hour | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 3.000000 | 9.0 | 21 | 13.0 | Parts per million | 1.145833 | 4.2 | 21 | NaN |
| 1 | 1 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 3.000000 | 9.0 | 21 | 13.0 | Parts per million | 0.878947 | 2.2 | 23 | 25.0 |
| 2 | 2 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 2.975000 | 6.6 | 23 | NaN | Parts per million | 1.145833 | 4.2 | 21 | NaN |
| 3 | 3 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-01 | Parts per billion | ... | Parts per billion | 2.975000 | 6.6 | 23 | NaN | Parts per million | 0.878947 | 2.2 | 23 | 25.0 |
| 4 | 4 | 4 | 13 | 3002 | 1645 E ROOSEVELT ST-CENTRAL PHOENIX STN | Arizona | Maricopa | Phoenix | 2000-01-02 | Parts per billion | ... | Parts per billion | 1.958333 | 3.0 | 22 | 4.0 | Parts per million | 0.850000 | 1.6 | 23 | NaN |
5 rows × 29 columns
raw_data.columns
data = raw_data.drop(['Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address', 'County', 'NO2 Units',
'O3 Units', 'SO2 Units', 'CO Units'], axis = 1)
data.columns
data.describe(include='all')
| State | City | Date Local | NO2 Mean | NO2 1st Max Value | NO2 1st Max Hour | NO2 AQI | O3 Mean | O3 1st Max Value | O3 1st Max Hour | O3 AQI | SO2 Mean | SO2 1st Max Value | SO2 1st Max Hour | SO2 AQI | CO Mean | CO 1st Max Value | CO 1st Max Hour | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1746661 | 1746661 | 1746661 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 873754.000000 | 1.746661e+06 | 1.746661e+06 | 1.746661e+06 | 873338.000000 |
| unique | 47 | 144 | 5996 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | California | Not in a city | 2002-06-10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 576142 | 138411 | 640 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | NaN | NaN | 1.282193e+01 | 2.541485e+01 | 1.173102e+01 | 2.389822e+01 | 2.612485e-02 | 3.920331e-02 | 1.017053e+01 | 3.605012e+01 | 1.870364e+00 | 4.492185e+00 | 9.664906e+00 | 7.115945 | 3.682177e-01 | 6.201067e-01 | 7.875026e+00 | 5.996595 |
| std | NaN | NaN | NaN | 9.504814e+00 | 1.599963e+01 | 7.877501e+00 | 1.516280e+01 | 1.136974e-02 | 1.534362e-02 | 4.003144e+00 | 1.978042e+01 | 2.760435e+00 | 7.679866e+00 | 6.731228e+00 | 11.937473 | 3.140231e-01 | 6.439361e-01 | 7.978844e+00 | 5.851836 |
| min | NaN | NaN | NaN | -2.000000e+00 | -2.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -2.000000e+00 | -2.000000e+00 | 0.000000e+00 | 0.000000 | -4.375000e-01 | -4.000000e-01 | 0.000000e+00 | 0.000000 |
| 25% | NaN | NaN | NaN | 5.750000e+00 | 1.300000e+01 | 5.000000e+00 | 1.200000e+01 | 1.787500e-02 | 2.900000e-02 | 9.000000e+00 | 2.500000e+01 | 2.565220e-01 | 8.000000e-01 | 5.000000e+00 | 1.000000 | 1.834580e-01 | 2.920000e-01 | 0.000000e+00 | 2.000000 |
| 50% | NaN | NaN | NaN | 1.073913e+01 | 2.400000e+01 | 9.000000e+00 | 2.300000e+01 | 2.587500e-02 | 3.800000e-02 | 1.000000e+01 | 3.300000e+01 | 9.875000e-01 | 2.000000e+00 | 8.000000e+00 | 3.000000 | 2.926250e-01 | 4.000000e-01 | 6.000000e+00 | 5.000000 |
| 75% | NaN | NaN | NaN | 1.771364e+01 | 3.570000e+01 | 2.000000e+01 | 3.300000e+01 | 3.391700e-02 | 4.800000e-02 | 1.100000e+01 | 4.200000e+01 | 2.325000e+00 | 5.000000e+00 | 1.400000e+01 | 9.000000 | 4.666670e-01 | 8.000000e-01 | 1.300000e+01 | 8.000000 |
| max | NaN | NaN | NaN | 1.395417e+02 | 2.670000e+02 | 2.300000e+01 | 1.320000e+02 | 9.508300e-02 | 1.410000e-01 | 2.300000e+01 | 2.180000e+02 | 3.216250e+02 | 3.510000e+02 | 2.300000e+01 | 200.000000 | 7.508333e+00 | 1.990000e+01 | 2.300000e+01 | 201.000000 |
data.isnull().sum()
data_no_mv = data.dropna(axis=0)
data_no_dupl = data_no_mv.drop_duplicates()
data_no_duplindex_col=0
data_no_dupl.describe(include='all')
| State | City | Date Local | NO2 Mean | NO2 1st Max Value | NO2 1st Max Hour | NO2 AQI | O3 Mean | O3 1st Max Value | O3 1st Max Hour | O3 AQI | SO2 Mean | SO2 1st Max Value | SO2 1st Max Hour | SO2 AQI | CO Mean | CO 1st Max Value | CO 1st Max Hour | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 435551 | 435551 | 435551 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 | 435551.000000 |
| unique | 47 | 144 | 5996 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | California | Not in a city | 2013-04-17 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 143967 | 34609 | 102 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | NaN | NaN | 12.823625 | 25.413372 | 11.728165 | 23.896749 | 0.026118 | 0.039195 | 10.167882 | 36.034584 | 1.883559 | 5.257113 | 8.785892 | 7.123531 | 0.369841 | 0.529403 | 6.254691 | 5.995429 |
| std | NaN | NaN | NaN | 9.510261 | 16.000778 | 7.876548 | 15.162184 | 0.011367 | 0.015334 | 4.002226 | 19.741178 | 2.764168 | 8.956407 | 6.800267 | 11.945473 | 0.316341 | 0.509932 | 7.840592 | 5.845054 |
| min | NaN | NaN | NaN | -2.000000 | -2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.000000 | -2.000000 | 0.000000 | 0.000000 | -0.437500 | -0.400000 | 0.000000 | 0.000000 |
| 25% | NaN | NaN | NaN | 5.750000 | 13.000000 | 5.000000 | 12.000000 | 0.017875 | 0.029000 | 9.000000 | 25.000000 | 0.266667 | 1.000000 | 3.000000 | 1.000000 | 0.191667 | 0.200000 | 0.000000 | 2.000000 |
| 50% | NaN | NaN | NaN | 10.739130 | 24.000000 | 9.000000 | 23.000000 | 0.025875 | 0.038000 | 10.000000 | 33.000000 | 1.000000 | 2.000000 | 8.000000 | 3.000000 | 0.295833 | 0.400000 | 2.000000 | 5.000000 |
| 75% | NaN | NaN | NaN | 17.717029 | 35.700000 | 20.000000 | 33.000000 | 0.033875 | 0.048000 | 11.000000 | 42.000000 | 2.333333 | 6.000000 | 13.000000 | 9.000000 | 0.470833 | 0.700000 | 10.000000 | 8.000000 |
| max | NaN | NaN | NaN | 139.541667 | 267.000000 | 23.000000 | 132.000000 | 0.095083 | 0.141000 | 23.000000 | 218.000000 | 321.625000 | 351.000000 | 23.000000 | 200.000000 | 7.508333 | 15.500000 | 23.000000 | 201.000000 |
import seaborn as sb
from pylab import *
# Show all distribution plots simultaneously in four subplots
sb.set(rc={"figure.figsize": (15, 10)})
subplot(2,2,1)
ax = sb.kdeplot(data_no_dupl['CO AQI'],shade=True)
subplot(2,2,2)
ax = sb.kdeplot(data_no_dupl['SO2 AQI'], shade=True)
subplot(2,2,3)
ax = sb.kdeplot(data_no_dupl['NO2 AQI'], shade=True)
subplot(2,2,4)
ax = sb.kdeplot(data_no_dupl['O3 AQI'], shade=True)
pp.show()
# Define 99% quantile
q1 = data_no_dupl['CO AQI'].quantile(0.99)
q1
# Dropping observations that are greater than the 99% quantile, which lies above the value 33.
CO_outliers = data_no_dupl[data_no_dupl['CO AQI'] > q1]
data_no_dupl = data_no_dupl.drop(CO_outliers.index, axis= 0)
# Show the new data distribution after outliers were removed.
sb.distplot(data_no_dupl['CO AQI'])
# Define 99% quantile
q2 = data_no_dupl['SO2 AQI'].quantile(.99)
q2
# Dropping observations that are greater than the 99% quantile, which lies above the value 69.
SO2_outliers = data_no_dupl[data_no_dupl['SO2 AQI']>q2]
data_no_dupl = data_no_dupl.drop(SO2_outliers.index)
# Show the new data distribution after outliers were removed.
sb.distplot(data_no_dupl['SO2 AQI'])
q3 = data_no_dupl['NO2 AQI'].quantile(.99)
q3
# Dropping observations that are greater than the 99% quantile, which lies above the value 70.
NO2_outliers = data_no_dupl[data_no_dupl['NO2 AQI']>q3]
data_no_dupl = data_no_dupl.drop(NO2_outliers.index)
# Show the new data distribution after outliers were removed.
sb.distplot(data_no_dupl['NO2 AQI'])
# Define 99% quantile
q4 = data_no_dupl['O3 AQI'].quantile(.99)
q4
# Dropping observations that are greater than the 99% quantile, which lies above the value 119.
O3_outliers = data_no_dupl[data_no_dupl['O3 AQI']>q4]
data_no_dupl = data_no_dupl.drop(O3_outliers.index)
# Show the new data distribution after outliers were removed.
sb.distplot(data_no_dupl['O3 AQI'])
data_no_dupl['State'].unique()
# Remove all rows that contain string 'Country Of Mexico' in column 'State'
data_var1 = data_no_dupl[~data_no_dupl.State.str.contains("Country Of Mexico")]
data_var1['City'].unique()
import datetime as dt
# make copy of data first to avoid ~SettingWithCopyWarning~
data_var1 = data_var1[data_var1['Date Local'].notnull()].copy()
data_var1['Year_Month'] = pd.to_datetime(data_var1['Date Local']).dt.strftime('%Y-%m') #Year-Month
data_var1['Year'] = pd.to_datetime(data_var1['Date Local']).dt.strftime('%Y') #Year
data_var1['Month'] = pd.to_datetime(data_var1['Date Local']).dt.strftime('%m') #Year
# create sub data set with relevant variables only
pollution_df = data_var1[['Year_Month','Year','Month','State','City','NO2 AQI','O3 AQI','SO2 AQI','CO AQI']]
pollution_CA = pollution_df [pollution_df['State'] == 'California'].reset_index(drop=True)
pollution_CA.head()
| Year_Month | Year | Month | State | City | NO2 AQI | O3 AQI | SO2 AQI | CO AQI | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2000-01 | 2000 | 01 | California | Concord | 25 | 25 | 3.0 | 9.0 |
| 1 | 2000-01 | 2000 | 01 | California | Concord | 28 | 27 | 3.0 | 9.0 |
| 2 | 2000-01 | 2000 | 01 | California | Concord | 38 | 12 | 6.0 | 22.0 |
| 3 | 2000-01 | 2000 | 01 | California | Concord | 42 | 9 | 6.0 | 23.0 |
| 4 | 2000-01 | 2000 | 01 | California | Concord | 36 | 19 | 6.0 | 19.0 |
CA_10per = pollution_CA.sample(frac=0.5)
CA_10per = CA_10per.sort_values('Year_Month').reset_index(drop=True)
#import chart_studio.tools as tls
import plotly.express as px
import cufflinks as cf
import plotly.graph_objs as go
# Create two new pd data frames
AQI_time= CA_10per[['Year_Month','NO2 AQI','O3 AQI','SO2 AQI','CO AQI']] # all four AQIs incl. date
AQI = AQI_time.iloc[:,1:] # all four AQIs only
#This interactive plot I had to toggle because there is an error when loading the chart_studio package.
#It works fine in other environments (e.g. Jupyter), but import the package here gives me the error
# 'ModuleNotFoundError: No module named 'chart_studio''. Uninstalling and reinstalling the package did also not solve it.
#Any help on this issue is appreciated!
#AQI.iplot(kind='histogram', subplots= True, shape = (1,4),
# xaxis_title="Value", yaxis_title = 'Count',
# color=["red", "goldenrod", "#00D", 'lightgreen'],
# title= {'text': "Distribution of AQI values per AQI category in CA"},
# filename='US Pollution-CA-AQI-distrib-multiple-histo')
sb.pairplot(AQI)
sb.jointplot(x=AQI_time["NO2 AQI"], y=AQI_time["CO AQI"], kind='kde',color='blue', xlim=(0,50),ylim=(0,15))
plt.show()
sb.jointplot(x=AQI_time["O3 AQI"], y=AQI_time["CO AQI"], kind='kde',color='green', xlim=(0,50),ylim=(0,15))
plt.show()
fig, (ax1, ax2,ax3,ax4) = pp.subplots(4,1, figsize = (20,20))
for ax in ax1, ax2,ax3,ax4:
ax.set(xlabel='Date')
ax.set(ylabel='Value')
ax1.bar(AQI_time['Year_Month'],AQI_time['CO AQI'], color = 'purple')
ax1.set_title('CO AQI')
ax1.set_xticks(['2000-06','2001-06','2002-06','2003-06','2004-06','2005-06','2006-06','2007-06','2008-06','2009-06','2010-06','2011-06', '2012-06','2013-06','2014-06','2015-06','2016-06'])
ax2.bar(AQI_time['Year_Month'], AQI_time['SO2 AQI'], color = 'red')
ax2.set_title('SO2 AQI')
ax2.set_xticks(['2000-06','2001-06','2002-06','2003-06','2004-06','2005-06','2006-06','2007-06','2008-06','2009-06','2010-06','2011-06', '2012-06','2013-06','2014-06','2015-06','2016-06'])
ax3.bar(AQI_time['Year_Month'],AQI_time['NO2 AQI'], color = 'green')
ax3.set_title('NO2 AQI')
ax3.set_xticks(['2000-06','2001-06','2002-06','2003-06','2004-06','2005-06','2006-06','2007-06','2008-06','2009-06','2010-06','2011-06', '2012-06','2013-06','2014-06','2015-06','2016-06'])
ax4.bar(AQI_time['Year_Month'],AQI_time['O3 AQI'], color = 'blue')
ax4.set_title('O3 AQI')
ax4.set_xticks(['2000-06','2001-06','2002-06','2003-06','2004-06','2005-06','2006-06','2007-06','2008-06','2009-06','2010-06','2011-06', '2012-06','2013-06','2014-06','2015-06','2016-06'])
pp.show()
AQI_time_grouped =AQI_time.groupby(['Year_Month']).mean().plot()
# All four AQI categories over time
fig = go.Figure()
fig.add_trace(go.Scatter(x=CA_10per['Year_Month'], y=CA_10per['NO2 AQI'],
mode='lines', name='NO2 AQI', opacity=0.7))
fig.add_trace(go.Scatter(x=CA_10per['Year_Month'], y=CA_10per['O3 AQI'],
mode='lines', name='O3 AQI', opacity=0.7))
fig.add_trace(go.Scatter(x=CA_10per['Year_Month'], y=CA_10per['SO2 AQI'],
mode='lines', name='SO2 AQI', opacity=1.0))
fig.add_trace(go.Scatter(x=CA_10per['Year_Month'], y=CA_10per['CO AQI'],
mode='markers', name='CO AQI', opacity=0.6))
fig.update_layout(legend_title_text = "AQI categories",
title='AQI categories 2000-2010')
fig.update_yaxes(title_text="Value (in PPM/PPB)")
fig.update_xaxes(title_text="Time")
fig.show()
fig = px.scatter(CA_10per,x='O3 AQI', y='NO2 AQI',
animation_frame='Year', animation_group='City',color='City',
range_y=[0, 90], range_x=[0, 100])
fig.show()